Here I showcase how to use the BSWiMS feature selection/modeling function coupled with the Goal Driven Sparse Transformation Matrix (UPSTM) as a pre-processing step to decorrelate highly correlated features. The aims are:
To improve model performance by uncovering the hidden information between correlated features.
To simplify the interpretation of the machine learning models.
This demo will use:
FRESA.CAD::IDeA(). For Decorrelation of Multidimensional data sets
heatmap.2(). For displaying the correlation matrix
# Load required libraries -----------------------------------------------------
library("FRESA.CAD")  # IDeA()/predictDecorrelate(), FRESAScale(), heatMaps(), correlated_Remove()
library(readxl)       # NOTE(review): not used in this chunk; data below is read with base R
library(igraph)       # NOTE(review): not used in this chunk
library(umap)         # umap() embedding of the feature space
library(tsne)         # NOTE(review): not used in this chunk
library(entropy)      # entropy()/discretize() used to compare information content
# Snapshot the graphical parameters so plots can restore them with par(op)
op <- par(no.readonly = TRUE)
# Global pander formatting options for the printed tables
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)
Data Source https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6960825/
Mohino-Herranz I, Gil-Pita R, Rosa-Zurera M, Seoane F. Activity Recognition Using Wearable Physiological Measurements: Selection of Features from a Comprehensive Literature Study. Sensors (Basel). 2019 Dec 13;19(24):0. doi: 10.3390/s19245524. PMID: 31847261; PMCID: PMC6960825.
# Load the raw activity data and the feature labels ---------------------------
# NOTE(review): paths are user-specific; consider a configurable root + file.path().
Activitydata <- read.csv("~/GitHub/LatentBiomarkers/Data/ActivityData/data.txt", header=FALSE, stringsAsFactors=TRUE)
featNames <- read.table("~/GitHub/LatentBiomarkers/Data/ActivityData/Featurelabels.txt", quote="\"", comment.char="")
featNames <- as.character(t(featNames))
# Make labels syntactically safe: replace the literal "(abs)" with "_A_".
# FIX: the original called stringr::str_replace_all(), but stringr is never
# loaded by this script, so that call would fail; base gsub() is equivalent here.
featNames <- gsub("\\(abs\\)", "_A_", featNames)
# Disambiguate duplicated labels by appending "_D"
featNames[duplicated(featNames)] <- paste(featNames[duplicated(featNames)],"D",sep="_")
# Build a within-subject repetition counter so row names are unique -----------
# rep_ID[i] counts consecutive rows sharing the same subject ID (column V1),
# restarting at 1 whenever the ID changes.
# BUG FIX: the original loop ran only to nrow-1, so the LAST row kept its
# initial value of 0 and was named "<ID>_0". Looping over every row and
# guarding the look-ahead comparison assigns the last row correctly.
rep_ID <- numeric(nrow(Activitydata))
ctr <- 1
for (ind in seq_len(nrow(Activitydata)))
{
  rep_ID[ind] <- ctr
  if (ind < nrow(Activitydata) && Activitydata$V1[ind] != Activitydata$V1[ind+1]) ctr <- 0;
  ctr <- ctr + 1
}
# Unique row names: "<subjectID>_<repetition>"
rownames(Activitydata) <- paste(Activitydata$V1,rep_ID,sep="_")
# First column is the subject ID, last column the activity class
colnames(Activitydata) <- c("ID",featNames,"class")
Activitydata$rep <- rep_ID
# Map the raw activity codes onto short, readable class labels.
# names(tb) holds the (sorted) original codes, so the mapping is
# code -> {"Neu","Emo","Men","Phy"} in that sorted order.
tb <- table(Activitydata$class)
classes <- setNames(c("Neu","Emo","Men","Phy"), names(tb))
Activitydata$class <- classes[as.character(Activitydata$class)]
table(Activitydata$class)
#>
#> Emo Men Neu Phy
#> 1120 1120 1120 1120
# Optional de-duplication (disabled): would keep only the first row of each
# (subject, class) pair. Left commented out so all repetitions are retained.
#ID_class <- paste(Activitydata$ID,Activitydata$class)
#Activitydata <- Activitydata[!duplicated(ID_class),]
# Counts are unchanged because the filter above is disabled
table(Activitydata$class)
#>
#> Emo Men Neu Phy
#> 1120 1120 1120 1120
# Subject-wise train/test split: 60% of the subjects go to training -----------
# NOTE(review): there is no set.seed() call, so the split differs between runs.
nsub <- unique(Activitydata$ID)
trainFraction <- 0.6
trainSample <- sample(length(nsub),length(nsub)*trainFraction)
trainDataFrame <- Activitydata[Activitydata$ID %in% nsub[trainSample],]
testDataFrame <- Activitydata[Activitydata$ID %in% nsub[-trainSample],]
# Keep only the features plus the class outcome
trainDataFrame <- trainDataFrame[,c(featNames,"class")]
testDataFrame <- testDataFrame[,c(featNames,"class")]
# Drop near-constant features (training variance ~ 0); the trailing TRUE keeps "class"
keepColumn <- c(apply(trainDataFrame[,featNames],2,var) > 1.0e-16,TRUE)
trainDataFrame <- trainDataFrame[,keepColumn]
testDataFrame <- testDataFrame[,keepColumn]
# Remove near-perfect duplicates (|rho| > 0.9999) decided on the training set only
candidateFeatures <- colnames(trainDataFrame)
candidateFeatures <- candidateFeatures[candidateFeatures != "class"]
tokeep <- c(as.character(correlated_Remove(trainDataFrame,candidateFeatures,thr=0.9999)),"class")
trainDataFrame <- trainDataFrame[,tokeep]
testDataFrame <- testDataFrame[,tokeep]
# Rank-based logit scaling ("OrderLogit") fitted on the training set ----------
trainScale <- FRESAScale(trainDataFrame,method="OrderLogit")
trainDataFrame <- trainScale$scaledData
table(trainDataFrame$class)
#>
#> Emo Men Neu Phy
#> 672 672 672 672
# Re-use the training-derived reference mean/dispersion on the test set so no
# test-set information leaks into the scaling
testDataFrame <- FRESAScale(testDataFrame,method="OrderLogit",refMean=trainScale$refMean,refDisp=trainScale$refDisp)$scaledData
table(testDataFrame$class)
#>
#> Emo Men Neu Phy
#> 448 448 448 448
# Analysis parameters ---------------------------------------------------------
outcome <- "class"
trainFraction <- 0.7   # NOTE(review): unused below; the split above used 0.6
rhoThreshold <- 0.6    # correlation threshold passed to IDeA()
TopVariables <- 10     # NOTE(review): not used in this chunk
aucTHR <- 0.75         # NOTE(review): not used in this chunk
# Alternative row-wise split, kept for reference (disabled)
#set.seed(5)
#trainSample <- sample(nrow(dataframe),nrow(dataframe)*trainFraction)
#trainDataFrame <- dataframe[trainSample,]
#testDataFrame <- dataframe[-trainSample,]
# Class distribution of the training and test sets
pander::pander(table(trainDataFrame[,outcome]))
| Emo | Men | Neu | Phy |
|---|---|---|---|
| 672 | 672 | 672 | 672 |
pander::pander(table(testDataFrame[,outcome]))
| Emo | Men | Neu | Phy |
|---|---|---|---|
| 448 | 448 | 448 | 448 |
# Feature names = every column except the outcome; build a numeric copy of the
# test set so the class column can be shown on the heat map.
varlist <- setdiff(colnames(trainDataFrame), outcome)
hmdf <- testDataFrame
hmdf$class <- as.numeric(as.factor(testDataFrame$class))
# Heat map of the scaled test samples (FRESA.CAD::heatMaps), rows clustered ---
hm <- heatMaps(data=hmdf,
Outcome="class",
Scale=FALSE,
hCluster = "row",
xlab="Feature",
ylab="Sample",
cexCol=0.15,
cexRow=0.25
)
# Restore the graphical parameters saved at the top of the script
par(op)
# 2-D UMAP embedding of the original (still-correlated) training features -----
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
# IDIOM FIX: use <- rather than = for top-level assignment
datasetframe.umap <- umap(trainDataFrame[,varlist],n_components=2)
# Draw an empty frame, then label each sample with its class, colored by class
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=trainDataFrame$class,col=raincolors[trainDataFrame$class])
# IDeA: iteratively decorrelate features whose |correlation| exceeds
# rhoThreshold; the result carries the transformed frame and the learned
# transformation, later applied to the test set via predictDecorrelate()
trainDecor <- IDeA(trainDataFrame,thr=rhoThreshold,verbose=TRUE)
#>
#> Included: 323 , Uni p: 0.01536768 , Uncorrelated Base: 54 , Outcome-Driven Size: 0 , Base Size: 54
#>
#>
#> 1 <R=1.000,w= 1,N= 209>, Top: 47( 10 )[ 1 : 47 : 0.975 ]( 46 , 151 , 0 ),<|>Tot Used: 197 , Added: 151 , Zero Std: 0 , Max Cor: 1.000
#>
#> 2 <R=1.000,w= 1,N= 209>, Top: 14( 3 )[ 1 : 14 : 0.975 ]( 14 , 27 , 46 ),<|>Tot Used: 202 , Added: 27 , Zero Std: 0 , Max Cor: 1.000
#>
#> 3 <R=1.000,w= 1,N= 209>, Top: 1( 1 )[ 1 : 1 : 0.975 ]( 1 , 1 , 60 ),<|>Tot Used: 202 , Added: 1 , Zero Std: 0 , Max Cor: 0.974
#>
#> 4 <R=0.974,w= 2,N= 75>, Top: 30( 2 )[ 1 : 30 : 0.937 ]( 30 , 38 , 61 ),<|>Tot Used: 234 , Added: 38 , Zero Std: 0 , Max Cor: 0.983
#>
#> 5 <R=0.983,w= 2,N= 75>, Top: 5( 1 )[ 1 : 5 : 0.941 ]( 5 , 5 , 75 ),<|>Tot Used: 234 , Added: 5 , Zero Std: 0 , Max Cor: 0.935
#>
#> 6 <R=0.935,w= 3,N= 84>, Top: 33( 1 )[ 1 : 33 : 0.868 ]( 33 , 43 , 79 ),<|>Tot Used: 257 , Added: 43 , Zero Std: 0 , Max Cor: 0.951
#>
#> 7 <R=0.951,w= 3,N= 84>, Top: 4( 1 )[ 1 : 4 : 0.875 ]( 4 , 5 , 96 ),<|>Tot Used: 257 , Added: 5 , Zero Std: 0 , Max Cor: 0.867
#>
#> 8 <R=0.867,w= 4,N= 85>, Top: 34( 1 )[ 1 : 34 : 0.784 ]( 34 , 43 , 99 ),<|>Tot Used: 270 , Added: 43 , Zero Std: 0 , Max Cor: 0.946
#>
#> 9 <R=0.946,w= 4,N= 85>, Top: 4( 1 )[ 1 : 4 : 0.823 ]( 4 , 4 , 116 ),<|>Tot Used: 270 , Added: 4 , Zero Std: 0 , Max Cor: 0.929
#>
#> 10 <R=0.929,w= 5,N= 85>, Top: 38( 1 )[ 1 : 38 : 0.715 ]( 37 , 46 , 119 ),<|>Tot Used: 284 , Added: 46 , Zero Std: 0 , Max Cor: 0.986
#>
#> 11 <R=0.986,w= 5,N= 85>, Top: 5( 1 )[ 1 : 5 : 0.743 ]( 5 , 5 , 139 ),<|>Tot Used: 285 , Added: 5 , Zero Std: 0 , Max Cor: 0.735
#>
#> 12 <R=0.735,w= 6,N= 104>, Top: 44( 3 )[ 1 : 44 : 0.600 ]( 43 , 51 , 140 ),<|>Tot Used: 295 , Added: 51 , Zero Std: 0 , Max Cor: 0.677
#>
#> 13 <R=0.677,w= 6,N= 104>, Top: 8( 1 )[ 1 : 8 : 0.600 ]( 7 , 8 , 150 ),<|>Tot Used: 295 , Added: 8 , Zero Std: 0 , Max Cor: 0.599
#>
#> 14 <R=0.000,w= 7,N= 0>
#>
#> [ 14 ], 0.5991722 Decor Dimension: 295 . Cor to Base: 167 , ABase: 32 , Outcome Base: 0
#>
# Apply the learned decorrelation transform to the held-out test set ----------
testDecor <- predictDecorrelate(trainDecor,testDataFrame)
# Feature names of the decorrelated test frame (outcome column excluded)
varlistc <- colnames(testDecor)[colnames(testDecor) != outcome]
# Total feature variance before vs after decorrelation
pander::pander(sum(apply(testDataFrame[,varlist],2,var)))
592
pander::pander(sum(apply(testDecor[,varlistc],2,var)))
168
# Empirical entropy (256-bin discretization of all values) before vs after
pander::pander(entropy(discretize(unlist(testDataFrame[,varlist]), 256)))
4.88
pander::pander(entropy(discretize(unlist(testDecor[,varlistc]), 256)))
2.85
# Residual |Pearson correlation| among the decorrelated features --------------
cormat <- cor(testDecor[,varlistc],method="pearson")
cormat[is.na(cormat)] <- 0   # constant columns produce NA correlations
diag(cormat) <- 0            # blank the trivial self-correlation diagonal
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  # FIX: spell out `margins`; the original `mar = c(5,5)` relied
                  # on partial argument matching against heatmap.2's `margins`.
                  margins = c(5,5),
                  col=rev(heat.colors(5)),
                  main = "Correlation after IDeA",
                  cexRow = 0.15,
                  cexCol = 0.15,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Feature", ylab="Feature")
# Restore the graphical parameters saved at the top of the script
par(op)